This document is used to generate a pilot test report for property value evaluation in Los Angeles County incorporating the spatial regression adjustments.
Importing Data
parcel <- st_read("D:\\Project Data\\Data_Viz project data\\plot_geo.geojson")
## Reading layer `plot_geo' from data source `D:\Project Data\Data_Viz project data\plot_geo.geojson' using driver `GeoJSON'
## Simple feature collection with 2216 features and 28 fields
## geometry type: POLYGON
## dimension: XY
## bbox: xmin: -118.9452 ymin: 33.69692 xmax: -117.6552 ymax: 34.8233
## geographic CRS: WGS 84
selected_cols <- c("CT", "ZIPcode5", "TotalValue", "EffectiveYearBuilt", "SQFTmain",
"Bedrooms", "Bathrooms", "geometry")
parcel <- parcel[, selected_cols]
unemployment <- read.csv("D:\\Project Data\\Data_Viz project data\\ACS\\ACS_17_5YR_S2301_with_ann.csv")
unemployment <- unemployment[2:nrow(unemployment), c("GEO.display.label", "HC04_EST_VC01")]
colnames(unemployment) <- c("GEO.display.label", "unemployment")
unemployment$CT <- ""
for (i in seq(1, nrow(unemployment), 1)){
if (substr(unemployment$GEO.display.label[i], 18, 18) != ","){
unemployment[i, "CT"] <- paste0(substr(unemployment$GEO.display.label[i], 14, 17),
substr(unemployment$GEO.display.label[i], 19, 20))}
else{unemployment[i, "CT"] <- paste0(substr(unemployment$GEO.display.label[i], 14, 17), paste("00"))}
}
median_income = read.csv("D:\\Project Data\\Data_Viz project data\\ACS\\ACS_17_5YR_S1903_with_ann.csv")
median_income <- median_income[2:nrow(median_income), c("GEO.display.label", "HC03_EST_VC02")]
colnames(median_income) <- c("GEO.display.label", "median_income")
median_income$CT <- ""
for (i in seq(1, nrow(median_income), 1)){
if (substr(median_income$GEO.display.label[i], 18, 18) != ","){
median_income[i, "CT"] <- paste0(substr(median_income$GEO.display.label[i], 14, 17),
substr(median_income$GEO.display.label[i], 19, 20))}
else{
median_income[i, "CT"] <- paste0(substr(median_income$GEO.display.label[i], 14, 17), paste("00"))}
}
parcel <- merge(parcel, unemployment, by.x="CT", by.y="CT")
parcel <- merge(parcel, median_income, by.x="CT", by.y="CT")
parcel$ZIPcode5 <- NULL
parcel <- parcel[(parcel$EffectiveYearBuilt != 0) & (parcel$TotalValue != 0), ]
parcel$age <- 2018 - parcel$EffectiveYearBuilt
parcel$logvalue <- log(parcel$TotalValue)
parcel$median_income <- as.numeric(parcel$median_income)
parcel$unemployment <- as.numeric(parcel$unemployment)
Choropleth for the Property Values in Los Angeles
color_Q <- colorQuantile("YlGnBu", domain=parcel$TotalValue, n=9)
leaflet(parcel) %>%
addPolygons(stroke = FALSE, fillOpacity = .8, smoothFactor = 0.2, color = ~ color_Q(TotalValue)
) %>%
addTiles() %>%
addLegend("bottomleft", pal=color_Q, values=~TotalValue,
title = "property value",
opacity = 1
)
for (i in seq(0, 1, 0.11)){
print(paste0(paste(i), "th quantile of property value is $", paste(quantile(parcel$TotalValue, i))))
}
## [1] "0th quantile of property value is $48036"
## [1] "0.11th quantile of property value is $218459.465"
## [1] "0.22th quantile of property value is $249886.36"
## [1] "0.33th quantile of property value is $277168.295"
## [1] "0.44th quantile of property value is $307175.38"
## [1] "0.55th quantile of property value is $344193.1"
## [1] "0.66th quantile of property value is $392150.18"
## [1] "0.77th quantile of property value is $474087.3"
## [1] "0.88th quantile of property value is $614214.24"
## [1] "0.99th quantile of property value is $1394058.43"
OLS Regression for Property Value Evaluation \[PropertyValue_i = \beta_0 + \beta_1 MedianIncome_i + \beta_2 Unemployment_i + \beta_3 SquareFootage_i\] \[+ \beta_4 Age_i + \beta_5 Bedrooms_i + \beta_6 Bathrooms_i + u_i\] Where we use median income, unemployment rate, median house square footage, median house age, median number of bedrooms and median number of bathrooms in census tract i to predict the median property value in census tract i
LA_pred.ols <- lm(TotalValue ~ median_income + unemployment + SQFTmain + age + Bedrooms + Bathrooms, data=parcel, na.action=na.exclude)
summary(LA_pred.ols)
##
## Call:
## lm(formula = TotalValue ~ median_income + unemployment + SQFTmain +
## age + Bedrooms + Bathrooms, data = parcel, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3933467 -77969 -28171 46882 7574016
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 384868.055 34790.672 11.062 < 2e-16 ***
## median_income -22.897 9.620 -2.380 0.0174 *
## unemployment 111.106 108.032 1.028 0.3038
## SQFTmain 156.959 2.697 58.196 < 2e-16 ***
## age -2915.140 378.093 -7.710 1.90e-14 ***
## Bedrooms -43025.456 9355.921 -4.599 4.49e-06 ***
## Bathrooms 23104.075 10808.143 2.138 0.0327 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 279100 on 2173 degrees of freedom
## Multiple R-squared: 0.6333, Adjusted R-squared: 0.6323
## F-statistic: 625.4 on 6 and 2173 DF, p-value: < 2.2e-16
In the above results, housing characteristics like house square footage, house age, number of bedrooms and number of bathrooms are very effective (statistically significant) in predicting property values. However, the high magnitudes and different signs for the coefficients of #bathrooms and #bedrooms are abnormal. This might be attributed to multicolinearity with house square footage. Therefore, we simplify the model into
\[PropertyValue_i = \beta_0 + \beta_1 MedianIncome_i + \beta_2 Unemployment_i + \beta_3 SquareFootage_i + \beta_4 Age_i + u_i\]
LA_pred.ols <- lm(TotalValue ~ median_income + unemployment + SQFTmain + age, data=parcel, na.action=na.exclude)
summary(LA_pred.ols)
##
## Call:
## lm(formula = TotalValue ~ median_income + unemployment + SQFTmain +
## age, data = parcel, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3882508 -78923 -32213 46169 7625693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 330710.699 30220.184 10.943 <2e-16 ***
## median_income -24.147 9.593 -2.517 0.0119 *
## unemployment 110.589 108.567 1.019 0.3085
## SQFTmain 157.640 2.643 59.635 <2e-16 ***
## age -3276.875 355.412 -9.220 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 280800 on 2175 degrees of freedom
## Multiple R-squared: 0.6287, Adjusted R-squared: 0.628
## F-statistic: 920.7 on 4 and 2175 DF, p-value: < 2.2e-16
Where we can observe that house characteristics are still effective in predicting property values that properties with less year of usage and larger space have higher property values. In contrast, we also see the negative effect in median income which might against our intuition. In order to correct the violation of OLS assumption due to the existance of spatial auto-correlation, spatial regression analysis is done in the following sector.
Take Queen Weight for neighbored geometries
LA_W.queen <- poly2nb(parcel, queen=TRUE)
LA_W <- nb2listw(LA_W.queen , style="W", zero.policy=TRUE)
Spatial Regression (Adjusting OLS using the weight derived from geographical adjacencies)
LA_W.sar <- lagsarlm(TotalValue ~ median_income + unemployment + SQFTmain + age, data=parcel,
LA_W, zero.policy=TRUE)
summary(LA_W.sar)
##
## Call:lagsarlm(formula = TotalValue ~ median_income + unemployment +
## SQFTmain + age, data = parcel, listw = LA_W, zero.policy = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3860187.1 -51055.8 -8215.8 39365.4 7766350.1
##
## Type: lag
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 171902.0823 27261.6617 6.3056 2.870e-10
## median_income -11.8236 7.5957 -1.5566 0.1196
## unemployment 11.4402 NA NA NA
## SQFTmain 152.9393 2.5155 60.7979 < 2.2e-16
## age -2707.2726 334.7542 -8.0873 6.661e-16
##
## Rho: 0.32118, LR test value: 200.14, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.021488
## z-value: 14.947, p-value: < 2.22e-16
## Wald statistic: 223.42, p-value: < 2.22e-16
##
## Log likelihood: -30339.38 for lag model
## ML residual variance (sigma squared): 7.0354e+10, (sigma: 265240)
## Number of observations: 2180
## Number of parameters estimated: 7
## AIC: 60693, (AIC for lm: 60891)
After the correction using the weight of spatial auto-correlation, coefficient of median income become insignificant and the magnitudes of coefficients of house characteristics slightly decrease. To conclude, the property values are basically driven by house properties, and the effects from demographic attributes are more ambiguous.
Global Moran’s I
lm.morantest(LA_pred.ols, LA_W, alternative="two.sided", zero.policy=TRUE)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = TotalValue ~ median_income + unemployment +
## SQFTmain + age, data = parcel, na.action = na.exclude)
## weights: LA_W
##
## Moran I statistic standard deviate = 21.453, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
## Observed Moran I Expectation Variance
## 0.2743221730 -0.0009394768 0.0001646315
Local Mora’s I
localm <- localmoran(parcel$TotalValue, LA_W)
temp_mi <- data.frame(id=parcel$CT, MI=data.frame(localm)$Ii)
plot_mi <- merge(parcel, temp_mi, by.x="CT", by.y="id")
local_mi <- colorQuantile("Blues", plot_mi$MI, n=9)
leaflet(plot_mi) %>%
addPolygons(stroke = FALSE, fillOpacity = .8, smoothFactor = 0.2, color = ~ local_mi(MI)
) %>%
addTiles() %>%
addLegend("bottomleft", pal=local_mi, values=~MI,
title = "local Moran's I",
opacity = 1
)
The above choropleth shows the distribution of clustering effect for the property values. Regions with darker colors have higher property value correlations with their neighbored regions.
Reference
PPHA38520 Multilevel GIS Applications in the Social Sciences Course Materials from Ned English, The University of Chicago
American Community Survey (https://factfinder.census.gov/faces/nav/jsf/pages/index.xhtml)
Assessor Parcels Data from Los Angeles County Open Data Portal (https://data.lacounty.gov/Parcel-/Assessor-Parcels-Data-2006-thru-2019/9trm-uz8i)